set.seed(895893)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyr)
library(broom)
library(papaja)
## Loading required package: tinylabels
This document outlines using the suggested procedure to simulate the number of participants necessary to accurately measure items. Based on what we know about statistical power, there are two key issues these ideas should address:
Population. The data was simulated using the
rnorm function assuming a normal distribution for 30 scale
type items. Each population was simulated with 1000 data points. No
items were rounded for this simulation.
First, the scale of the data was manipulated by creating three sets of scales. The first scale was mimicked after small rating scales (i.e., 1-7 type style) using a \(\mu\) = 4 with a \(\sigma\) = .25 around the mean to create item mean variability. The second scale included a larger potential distribution of scores with a \(\mu\) = 50 (\(\sigma\) = 10) imitating a 0-100 scale. Last, the final scale included a \(\mu\) = 1000 (\(\sigma\) = 150) simulating a study that may include response latency data in the milliseconds. While there are many potential scales, these three represent a large number of potential variables in the social sciences. As we are suggesting variances as a key factor for estimating sample sizes, the scale of the data is influential on the amount of potential variance. Smaller ranges of data (1-7) cannot necessarily have the same variance as larger ranges (0-100).
Next, item variance heterogeneity was included by manipulating the potential \(\sigma\) for each individual item. For small scales, the item \(\sigma\) averaged 2 points, drawn with a standard deviation of .2, .4, and .8 for low, medium, and high heterogeneity in the variances between items. For the medium scale of data, \(\sigma\) = 25 with a standard deviation of 4, 8, and 16. Last, for the large scale of data, \(\sigma\) = 400 with a standard deviation of 50, 100, and 200 for heterogeneity.
# Item means and item-SD draws for three measurement scales.
# Scale 1: small rating scale (roughly 1-7 style)
mu1 <- rnorm(30, 4, 0.25)
sigma1.1 <- rnorm(30, 2, 0.2)  # low heterogeneity
sigma2.1 <- rnorm(30, 2, 0.4)  # medium heterogeneity
sigma3.1 <- rnorm(30, 2, 0.8)  # high heterogeneity
# Scale 2: medium range, imitating a 0-100 scale
mu2 <- rnorm(30, 50, 10)
sigma1.2 <- rnorm(30, 25, 4)
sigma2.2 <- rnorm(30, 25, 8)
sigma3.2 <- rnorm(30, 25, 16)
# redraw until every item SD is non-negative (rnorm needs sd >= 0 later)
while (any(sigma3.2 < 0)) {
  sigma3.2 <- rnorm(30, 25, 16)
}
# Scale 3: large range, e.g., millisecond response latencies
mu3 <- rnorm(30, 1000, 150)
sigma1.3 <- rnorm(30, 400, 50)
sigma2.3 <- rnorm(30, 400, 100)
sigma3.3 <- rnorm(30, 400, 200)
while (any(sigma3.3 < 0)) {
  sigma3.3 <- rnorm(30, 400, 200)
}
# Build one population per heterogeneity level. Each population holds
# 30 items x 3 scales x 1000 simulated respondents in long format.
# The item means (mu1/mu2/mu3) and the supplied item-SD vectors (length
# 30 each) are recycled across draws, lining up with the repeating item
# index, so each item keeps its own mean and SD.
#
# s1, s2, s3: vectors of 30 item SDs for the small, medium, and large
#   scales respectively.
# Returns a data frame with columns item (1-30), scale (1-3), score.
make_population <- function(s1, s2, s3) {
  data.frame(
    item = rep(1:30, 1000 * 3),
    scale = rep(1:3, each = 1000 * 30),
    score = c(rnorm(1000 * 30, mean = mu1, sd = s1),
              rnorm(1000 * 30, mean = mu2, sd = s2),
              rnorm(1000 * 30, mean = mu3, sd = s3))
  )
}
# call order matches the original script so the RNG stream is unchanged
population1 <- make_population(sigma1.1, sigma1.2, sigma1.3)  # low heterogeneity
population2 <- make_population(sigma2.1, sigma2.2, sigma2.3)  # medium heterogeneity
population3 <- make_population(sigma3.1, sigma3.2, sigma3.3)  # high heterogeneity
# evidence of correct simulation: per-item, per-scale means track mu1/mu2/mu3
with(population1, tapply(score, list(item, scale), mean))
## 1 2 3
## 1 3.902902 30.46267 775.0367
## 2 3.641517 46.46319 1067.1570
## 3 3.812100 42.52329 1295.5779
## 4 3.854610 58.00755 1096.0101
## 5 3.433522 42.00242 872.0807
## 6 4.143272 71.21973 1148.3532
## 7 4.107761 54.17529 1050.0229
## 8 4.434764 37.40733 958.1336
## 9 3.619174 40.06408 983.3803
## 10 4.006974 36.93354 1198.8293
## 11 3.267784 49.48155 1225.3493
## 12 3.813818 45.54308 1329.6380
## 13 4.269054 55.74713 968.1331
## 14 3.960559 41.97700 1093.0687
## 15 3.960783 56.92441 958.7610
## 16 3.839746 32.21330 1048.6222
## 17 3.966508 69.81192 835.1373
## 18 4.079169 40.03570 1258.1913
## 19 4.357796 50.84897 1199.0519
## 20 3.961464 44.77769 818.1981
## 21 3.965871 41.93207 1157.8404
## 22 4.347589 51.05360 874.5282
## 23 4.045330 38.66577 1068.7840
## 24 4.239570 52.51189 988.9018
## 25 3.721038 59.13788 832.1767
## 26 3.858812 50.79533 1092.1147
## 27 4.219286 48.45677 935.6302
## 28 3.862671 51.12075 863.7208
## 29 3.704449 62.41377 782.8347
## 30 4.492225 58.47688 806.9001
# per-item, per-scale SDs should track the sigma1.* draws
with(population1, tapply(score, list(item, scale), sd))
## 1 2 3
## 1 1.890627 26.34446 447.9221
## 2 2.058239 18.48790 413.3759
## 3 1.948857 26.27541 417.0936
## 4 1.991223 24.31908 372.9159
## 5 1.969047 22.25882 315.2520
## 6 1.883817 19.87603 314.8351
## 7 1.833318 32.14773 350.0537
## 8 2.122209 21.25980 448.5052
## 9 2.306645 23.92401 402.1434
## 10 1.754506 24.73113 391.3724
## 11 1.587011 24.22367 365.5963
## 12 1.802461 30.18503 421.5994
## 13 1.999581 34.07362 371.7095
## 14 1.789389 27.55662 307.0195
## 15 2.007439 30.14664 431.6300
## 16 2.050786 25.18630 393.5207
## 17 2.241356 23.31799 380.9175
## 18 2.214826 23.91088 431.9195
## 19 2.043198 28.51435 461.5439
## 20 1.823680 27.28226 387.0149
## 21 2.218899 24.29106 356.7623
## 22 1.784586 27.56687 444.6985
## 23 1.787658 31.03734 387.7521
## 24 2.032558 31.50736 449.9648
## 25 2.135126 25.04701 413.4826
## 26 1.902208 27.32053 383.7975
## 27 2.167401 25.17818 291.1477
## 28 2.007897 22.81182 380.9693
## 29 2.293348 27.51165 293.8116
## 30 1.541930 22.79358 378.0659
# per-item, per-scale means for the medium-heterogeneity population
with(population2, tapply(score, list(item, scale), mean))
## 1 2 3
## 1 3.950501 33.00300 780.0960
## 2 3.750543 47.68966 1089.9544
## 3 3.735677 42.45975 1298.4786
## 4 3.988222 58.47058 1094.0806
## 5 3.334382 41.50217 843.9473
## 6 4.158273 68.92020 1135.7110
## 7 4.175987 53.60701 1059.9587
## 8 4.319914 39.82013 966.4655
## 9 3.580827 40.23406 971.7099
## 10 4.110682 36.55129 1207.6237
## 11 3.259170 49.53321 1227.2703
## 12 3.768975 44.31530 1325.8837
## 13 4.254734 56.24424 960.0673
## 14 3.929969 43.36030 1101.0468
## 15 4.083672 56.14094 944.9630
## 16 3.908679 32.22729 1046.5041
## 17 3.961111 71.02240 822.8286
## 18 4.106718 39.97576 1255.1825
## 19 4.325670 51.90323 1202.2096
## 20 4.054181 45.06967 803.6126
## 21 4.059961 40.96535 1149.1808
## 22 4.227156 51.07047 866.0613
## 23 4.048451 37.51939 1061.6135
## 24 4.116550 52.61103 975.8781
## 25 3.567186 57.94929 818.8236
## 26 3.890774 49.19034 1126.9975
## 27 4.290088 50.23184 955.9890
## 28 3.834475 50.37329 852.1188
## 29 3.685025 60.27283 772.6668
## 30 4.433935 57.36398 798.2312
# per-item, per-scale SDs should track the sigma2.* draws
with(population2, tapply(score, list(item, scale), sd))
## 1 2 3
## 1 2.763139 40.60330 272.9881
## 2 1.390085 31.57465 376.1165
## 3 1.577165 16.80232 217.6260
## 4 2.255374 10.97166 265.3098
## 5 2.409540 39.88434 368.8119
## 6 1.674612 29.04493 499.2654
## 7 2.026715 16.81706 203.5280
## 8 1.820940 41.46035 367.9732
## 9 2.493294 36.34031 270.3850
## 10 2.016523 17.23096 389.4101
## 11 2.468095 22.94024 463.2799
## 12 1.551968 31.83887 396.8280
## 13 2.105696 15.12426 198.6223
## 14 1.528888 31.08692 564.4742
## 15 2.277393 29.09624 329.6530
## 16 1.646159 27.38014 394.2811
## 17 2.355252 50.17814 385.7395
## 18 1.533480 13.74537 396.6787
## 19 2.019509 37.31631 425.8186
## 20 1.924035 27.78527 280.9045
## 21 1.853170 28.62314 410.2052
## 22 1.998172 33.01822 463.8064
## 23 2.004897 23.36831 378.2983
## 24 2.744461 27.39842 528.3982
## 25 2.224172 21.45756 546.9542
## 26 1.384873 16.25205 288.8152
## 27 2.655162 41.85640 203.0837
## 28 1.693612 19.18822 518.2790
## 29 1.962601 39.87810 524.9772
## 30 1.769749 23.70109 435.7949
# per-item, per-scale means for the high-heterogeneity population
with(population3, tapply(score, list(item, scale), mean))
## 1 2 3
## 1 3.855587 32.97237 762.9452
## 2 3.868187 46.99772 1089.5423
## 3 3.845957 43.32952 1292.2083
## 4 3.959157 57.84889 1102.7523
## 5 3.584825 41.60252 893.0890
## 6 3.963773 69.25035 1163.2111
## 7 4.107680 55.74294 1053.3420
## 8 4.215722 36.27618 952.7271
## 9 3.702241 40.65176 1012.1138
## 10 3.993711 36.36285 1211.2836
## 11 3.269975 49.06809 1229.8295
## 12 3.678456 44.27366 1350.8253
## 13 4.233037 56.38444 981.2822
## 14 3.912706 43.10341 1103.4023
## 15 4.054028 55.62390 959.7905
## 16 3.958513 33.88419 1069.0534
## 17 3.997336 70.16391 831.0627
## 18 4.163447 38.04008 1235.3849
## 19 4.243718 51.63930 1180.5805
## 20 4.011720 45.24226 813.5420
## 21 4.101582 41.20121 1122.5218
## 22 4.183665 50.81866 862.3160
## 23 4.171506 39.85052 1064.2565
## 24 4.020276 51.25713 985.8166
## 25 3.849307 58.62198 843.9054
## 26 3.807178 49.42927 1117.0703
## 27 4.115374 49.08685 962.5971
## 28 4.022901 49.30474 885.9961
## 29 3.832931 57.22959 804.6436
## 30 4.582119 57.19845 801.0198
# per-item, per-scale SDs should track the sigma3.* draws
with(population3, tapply(score, list(item, scale), sd))
## 1 2 3
## 1 3.2769848 30.592868 472.78086
## 2 2.7331542 27.606291 219.31130
## 3 2.2973465 11.310145 259.32081
## 4 2.3507781 23.973472 180.07712
## 5 1.7898735 4.842688 555.20067
## 6 1.9010021 19.076503 331.65811
## 7 3.3702103 38.460999 393.65848
## 8 1.9120117 27.669388 490.82736
## 9 3.2527780 42.160159 446.85460
## 10 2.0874351 32.874833 749.43323
## 11 2.7977082 20.573106 290.36279
## 12 3.3006312 31.424359 507.40278
## 13 1.1770792 39.776679 514.58535
## 14 2.3335247 15.357720 481.37062
## 15 0.7675617 43.340551 74.02248
## 16 1.5466065 46.567773 372.76795
## 17 1.7805748 7.299641 229.58366
## 18 2.6329110 41.760361 545.84021
## 19 4.0269547 16.713839 601.87632
## 20 0.8437944 15.940341 375.00585
## 21 1.2479703 26.052144 502.13185
## 22 2.2823698 33.476917 562.53933
## 23 2.6252197 42.923010 262.92805
## 24 5.1654318 19.085865 392.16348
## 25 1.6935013 33.603440 313.33447
## 26 1.4942078 19.737497 184.62965
## 27 2.7674848 9.092104 213.00917
## 28 2.8525696 36.119084 212.09303
## 29 1.8855775 46.882515 404.90381
## 30 1.8449062 11.442060 446.55408
Samples. Each population was then sampled as if a researcher was conducting a pilot study. The sample sizes started at 20 participants per item increasing in units of 5 up to 100 participants.
# Draw pilot samples of 20-100 respondents (steps of 5) per item/scale
# cell from each population, mimicking a researcher's pilot study.
samples1 <- samples2 <- samples3 <- list()
sizes <- seq(20, 100, by = 5)

# One pilot draw: n respondents per item-by-scale cell, without replacement.
draw_pilot <- function(pop, n) {
  pop %>%
    group_by(item, scale) %>%
    slice_sample(n = n)
}

for (i in seq_along(sizes)) {
  # population order (1, 2, 3) is kept to preserve the RNG stream
  samples1[[i]] <- draw_pilot(population1, sizes[i])
  samples2[[i]] <- draw_pilot(population2, sizes[i])
  samples3[[i]] <- draw_pilot(population3, sizes[i])
}
AIPE Criterions. The standard errors of each item were calculated to mimic the AIPE procedure of finding an appropriately small confidence interval, as standard error functions as the main component in the formula for normal distribution confidence intervals. Standard errors were calculated at each decile of the items up to 90% (0% smallest SE, 10% …, 90% largest SE). The lower deciles would represent a strict criterion for accurate measurement, as many items would need smaller SEs to meet AIPE goals, while the higher deciles would represent less strict criterions for AIPE goals.
# For each pilot sample compute: (a) the SD of the item SDs within each
# scale (a heterogeneity summary), (b) each item's standard error, and
# (c) SE cutoffs at each decile from 0% to 90%.
SES1 <- SES2 <- SES3 <- list()
cutoffs1 <- cutoffs2 <- cutoffs3 <- list()
sd_items1 <- sd_items2 <- sd_items3 <- list()

# SD of the 30 item SDs, by scale.
item_sd_spread <- function(samp) {
  samp %>% group_by(item, scale) %>%
    summarize(sd = sd(score), .groups = "keep") %>%
    ungroup() %>% group_by(scale) %>% summarize(sd_item = sd(sd))
}

# 30 x 3 matrix of item standard errors (rows = items, cols = scales).
se_matrix <- function(samp) {
  tapply(samp$score, list(samp$item, samp$scale),
         function(x) sd(x) / sqrt(length(x)))
}

# Decile cutoffs (0%, 10%, ..., 90%) of the item SEs, per scale.
se_cutoffs <- function(ses) {
  apply(as.data.frame(ses), 2, quantile, probs = seq(0, .9, by = .1))
}

for (i in seq_along(samples1)) {
  sd_items1[[i]] <- item_sd_spread(samples1[[i]])
  sd_items2[[i]] <- item_sd_spread(samples2[[i]])
  sd_items3[[i]] <- item_sd_spread(samples3[[i]])
  SES1[[i]] <- se_matrix(samples1[[i]])
  SES2[[i]] <- se_matrix(samples2[[i]])
  # BUG FIX: the original grouped samples3 scores by samples2's item and
  # scale columns (copy-paste error); population 3's SEs must be grouped
  # by samples3's own labels.
  SES3[[i]] <- se_matrix(samples3[[i]])
  cutoffs1[[i]] <- se_cutoffs(SES1[[i]])
  cutoffs2[[i]] <- se_cutoffs(SES2[[i]])
  cutoffs3[[i]] <- se_cutoffs(SES3[[i]])
}
In this section, we simulate what a researcher might do if they follow our suggested application of AIPE to sample size planning based on well measured items. Assuming each pilot sample represents a dataset a researcher has collected, we will simulate samples of 20 to 500 to determine what the new sample size suggestion would be. We assume that samples over 500 may be considered too large for many researchers who do not work in teams or have participant funds. The standard error of each item was calculated for each suggested sample size by pilot sample size by population type.
# Project item SEs at candidate sample sizes 20-500 (steps of 5) by
# resampling each pilot sample with replacement.
samplesize_values <- seq(20, 500, 5)
# storage: one result table per pilot sample size
sampled_values1 <- sampled_values2 <- sampled_values3 <- list()

# Bootstrap n rows per item/scale cell and return the per-cell SE.
boot_se <- function(samp, n) {
  samp %>%
    group_by(item, scale) %>%
    slice_sample(n = n, replace = TRUE) %>%
    summarize(se = sd(score) / sqrt(length(score)),
              .groups = "keep")
}

for (i in seq_along(samples1)) {
  # blank result tables: one row per candidate size; 90 item-by-scale SE
  # columns plus a sample_size column (column 91)
  blank <- as.data.frame(matrix(NA,
                                nrow = length(samplesize_values),
                                ncol = 30 * 3))
  blank$sample_size <- NA
  sim_table1 <- sim_table2 <- sim_table3 <- blank
  for (q in seq_along(samplesize_values)) {
    # resampling order (1, 2, 3) is kept to preserve the RNG stream
    sim_table1[q, 1:90] <- boot_se(samples1[[i]], samplesize_values[q])$se
    sim_table1[q, 91] <- samplesize_values[q]
    sim_table2[q, 1:90] <- boot_se(samples2[[i]], samplesize_values[q])$se
    sim_table2[q, 91] <- samplesize_values[q]
    sim_table3[q, 1:90] <- boot_se(samples3[[i]], samplesize_values[q])$se
    sim_table3[q, 91] <- samplesize_values[q]
  } # end candidate-size loop
  sampled_values1[[i]] <- sim_table1
  sampled_values2[[i]] <- sim_table2
  sampled_values3[[i]] <- sim_table3
} # end pilot-sample loop
Next, the percent of items that fall below the cutoff scores, and thus, would be considered “well-measured” were calculated for each decile by sample.
# For every projected sample size, compute the proportion of the 30
# items whose SE falls at or below each decile cutoff from the pilot
# sample (i.e., items that would count as "well-measured").

# Reshape one sampled_values table to long format with explicit item
# (1-30) and scale (1-3) labels. Columns 1-90 follow the
# group_by(item, scale) summarize order: item 1 scales 1-3, item 2
# scales 1-3, and so on.
lengthen_ses <- function(sv) {
  sv %>%
    pivot_longer(cols = -c(sample_size)) %>%
    rename(item = name, se = value) %>%
    mutate(scale = rep(1:3, 30 * length(samplesize_values)),
           item = rep(rep(1:30, each = 3), length(samplesize_values)))
}

# Percent of items at or below each decile cutoff, for one scale.
# cuts: 10 x 3 cutoff matrix (rows = deciles 0-90%, cols = scales).
tally_below <- function(df_long, cuts, scale_idx, pilot_n, src) {
  cut_col <- cuts[, scale_idx]
  df_long %>%
    filter(scale == scale_idx) %>%
    group_by(sample_size, scale) %>%
    summarize(Percent_Below0 = sum(se <= cut_col[1]) / 30,
              Percent_Below10 = sum(se <= cut_col[2]) / 30,
              Percent_Below20 = sum(se <= cut_col[3]) / 30,
              Percent_Below30 = sum(se <= cut_col[4]) / 30,
              Percent_Below40 = sum(se <= cut_col[5]) / 30,
              Percent_Below50 = sum(se <= cut_col[6]) / 30,
              Percent_Below60 = sum(se <= cut_col[7]) / 30,
              Percent_Below70 = sum(se <= cut_col[8]) / 30,
              Percent_Below80 = sum(se <= cut_col[9]) / 30,
              Percent_Below90 = sum(se <= cut_col[10]) / 30,
              .groups = "keep") %>%
    mutate(original_n = pilot_n,
           source = src)
}

# All three scales for one population, bound into one table.
tally_all_scales <- function(sv, cuts, pilot_n, src) {
  df_long <- lengthen_ses(sv)
  bind_rows(lapply(1:3, function(s) tally_below(df_long, cuts, s, pilot_n, src)))
}

summary_list1 <- summary_list2 <- summary_list3 <- list()
for (i in seq_along(sampled_values1)) {
  summary_list1[[i]] <- tally_all_scales(sampled_values1[[i]], cutoffs1[[i]],
                                         sizes[i], "low")
  summary_list2[[i]] <- tally_all_scales(sampled_values2[[i]], cutoffs2[[i]],
                                         sizes[i], "med")
  summary_list3[[i]] <- tally_all_scales(sampled_values3[[i]], cutoffs3[[i]],
                                         sizes[i], "high")
}
summary_DF <- bind_rows(summary_list1,
                        summary_list2,
                        summary_list3)
From this data, we pinpoint the smallest suggested sample size at which 80%, 85%, 90%, and 95% of the items fall below the cutoff criterion. These values were chosen as popular measures of “power” in which one could determine the minimum suggested sample size (potentially 80% of items) and the maximum suggested sample size (potentially 90%).
# Find the smallest projected sample size at which at least pct% of the
# items fall below each decile cutoff ("power" thresholds of 80/85/90/95).
smallest_n_at <- function(df, pct) {
  df %>%
    pivot_longer(cols = -c(sample_size, original_n, source, scale)) %>%
    filter(value >= pct / 100) %>%
    arrange(sample_size, original_n, source, scale, name) %>%
    group_by(original_n, name, source, scale) %>%
    slice_head(n = 1) %>% # first row = smallest qualifying sample size
    mutate(power = pct)
}

summary_long <- do.call(rbind, lapply(c(80, 85, 90, 95),
                                      function(p) smallest_n_at(summary_DF, p)))

# readable labels for plotting
summary_long$source <- factor(summary_long$source,
                              levels = c("low", "med", "high"),
                              labels = c("Low Variance", "Medium Variance", "High Variance"))
summary_long$scale2 <- factor(summary_long$scale,
                              levels = c(1:3),
                              labels = c("Small Scale", "Medium Scale", "Large Scale"))

# Attach the pilot heterogeneity summaries (SD of the item SDs).
sd_items <- bind_rows(sd_items1, sd_items2, sd_items3)
sd_items$original_n <- rep(rep(sizes, each = 3), 3)
sd_items$source <- rep(c("Low Variance", "Medium Variance", "High Variance"), each = 3*length(sizes))
summary_long <- summary_long %>%
  full_join(sd_items,
            by = c("original_n" = "original_n", "scale" = "scale",
                   "source" = "source"))
summary_long$name <- gsub("Percent_Below", "Decile ", summary_long$name)
We examined if this procedure is sensitive to differences in item heterogeneity, as we should expect to collect larger samples if we wish to have a large number of items reach a threshold of acceptable variance; potentially, assuring we could average them if a researcher did not wish to use a more complex analysis such as multilevel modeling.
The figure below illustrates the potential minimum sample size for 80% of items to achieve a desired cutoff score. The black dots denote the original sample size against the suggested sample size. By comparing the facets, we can determine that our suggested procedure does capture the differences in heterogeneity. As heterogeneity increases in item variances, the proposed sample size also increases, especially at stricter cutoffs. Missing cutoff points indicate combinations where the proposed sample size would have been higher than 500.
# Re-level the variance source so facets appear low -> medium -> high.
summary_long$source <- factor(summary_long$source,
                              levels = c("Low Variance", "Medium Variance", "High Variance"))
# Suggested vs. original sample size at the 80% threshold; black points
# mark the y = x reference (original sample size).
summary_long %>%
  filter(power == 80) %>%
  ggplot(aes(original_n, sample_size, color = name)) +
  geom_point() +
  geom_point(aes(original_n, original_n), color = "black") +
  geom_line() +
  theme_classic() +
  facet_wrap(~ source*scale2) +
  labs(x = "Original Sample Size",
       y = "Suggested Sample Size",
       color = "Cutoff Score")
In our second question, we examined if the suggested procedure was sensitive to the amount of information present in the pilot data. Larger pilot data is more informative, and therefore, we should expect a lower projected sample size. As shown in the figure below for only the low variability and small scale data, we do not find this effect. These simplistic simulations from the pilot data would nearly always suggest a larger sample size - mostly in a linear trend increasing with sample sizes. This result comes from the nature of the procedure - if we base our estimates on some SE cutoff, we will almost always need a bit more people for items to meet those goals. This result does not achieve our second goal.
# Same view restricted to the low-variance, small-scale condition,
# faceted by power threshold.
summary_long %>%
  filter(source == "Low Variance", scale == 1) %>%
  ggplot(aes(original_n, sample_size, color = name)) +
  geom_point() +
  geom_point(aes(original_n, original_n), color = "black") +
  geom_line() +
  theme_classic() +
  facet_wrap(~ power) +
  labs(x = "Original Sample Size",
       y = "Suggested Sample Size",
       color = "Cutoff Score")
Therefore, we suggest using a correction factor on the simulation procedure to account for the known asymptotic nature of power (i.e., at larger sample sizes power increases level off). For this function, we combined a correction factor for upward biasing of effect sizes (Hedges’ correction) with the formula for exponential decay calculations. The decay factor is calculated as follows:
\[ 1 - \sqrt{\frac{N_{pilot} - min(N_{sim})}{N_{pilot}}}^{log_2(N_{pilot})}\] \(N_{pilot}\) indicates the sample size of the pilot data, and \(min(N_{sim})\) is the minimum simulated sample size, which is subtracted to ensure that the smallest sample sizes do not decay (i.e., the formula zeroes out). This value is raised to the power of \(log_2\) of the sample size of the pilot data, which decreases the impact of the decay to smaller increments for increasing sample sizes. This value is then multiplied by the proposed sample size. As shown in the figure below, this correction factor produces the desired quality of maintaining that small pilot studies should increase sample size, and that sample size suggestions level off as pilot study data sample size increases.
# Exponential-decay correction: shrink the suggested sample size so that
# suggestions from large pilot samples level off rather than growing
# linearly. The subtraction of the minimum simulated sample size zeroes
# out the decay for the smallest pilots (decay = 1, no shrinkage), and
# the log2 exponent softens the decay as the pilot size grows.
min_sim_n <- min(samplesize_values)  # smallest simulated size (20)
decay <- 1 - sqrt((summary_long$original_n - min_sim_n)/summary_long$original_n)^log2(summary_long$original_n)
summary_long$new_sample <- summary_long$sample_size*decay
# Corrected (decayed) suggestions for the low-variance, small-scale
# condition, faceted by power threshold.
summary_long %>%
  filter(source == "Low Variance", scale == 1) %>%
  ggplot(aes(original_n, new_sample, color = name)) +
  geom_point() +
  geom_point(aes(original_n, original_n), color = "black") +
  geom_line() +
  theme_classic() +
  facet_wrap(~ power) +
  labs(x = "Original Sample Size",
       y = "Suggested Sample Size",
       color = "Cutoff Score")
We have portrayed that this procedure, with a correction factor, can perform as desired. However, within real scenarios, researchers will only have one pilot sample, not the various simulated samples shown above. What should the researcher do to correct their sample size on their own pilot data?
To explore if we could recover the new suggested sample size from data a researcher would have, we used linear models to create a formula for calculation. First, the corrected sample size was predicted by the original suggested sample size. Next, the standard deviation of the item standard deviations was added to the equation to recreate heterogeneity estimates. Last, we included the pilot sample size. The scale of the data is embedded into the standard deviation of the items (r = 0.81), and therefore, this variable was not included separately. The standard deviation of the item’s standard deviation embeds both heterogeneity and potential scale size.
# Model 1: corrected sample size predicted by the uncorrected suggestion.
user_model <- lm(new_sample ~ sample_size, data = summary_long)
user_print <- apa_print(user_model)
# Model 2: adds the SD of the item SDs (heterogeneity / scale proxy).
user_model2 <- lm(new_sample ~ sample_size + sd_item, data = summary_long)
user_print2 <- apa_print(user_model2)
# Model 3: adds the pilot sample size.
user_model3 <- lm(new_sample ~ sample_size + sd_item + original_n, data = summary_long)
user_print3 <- apa_print(user_model3)
# Sequential comparison of the nested models (F-tests), tidied for reporting.
change_table <- tidy(anova(user_model, user_model2, user_model3))
The first model using original sample size to predict new sample size was significant, \(R^2 = .89\), 90% CI \([0.89, 0.90]\), \(F(1, 5,666) = 48,042.64\), \(p < .001\), capturing nearly 90% of the variance, \(b = 0.62\), 95% CI \([0.62, 0.63]\). The second model with item standard deviation was better than the first model F(1, 5665) = 55.21, p < .001, \(R^2 = .89\), 90% CI \([0.89, 0.90]\). The item standard deviation predictor was significant, \(b = 0.02\), 95% CI \([0.01, 0.03]\), \(t(5665) = 4.54\), \(p < .001\). The addition of the original pilot sample size was also significant, F(1, 5664) = 9529.83, p < .001, \(R^2 = .96\), 90% CI \([0.96, 0.96]\).
As shown in the table below for our final model, the new suggested sample size is proportional to the original suggested sample size (i.e., b < 1), which reduces the sample size suggestion. As variability increases, the suggested sample size also increases to capture differences in heterogeneity shown above; however, this predictor is not significant in the final model, and only contributes a small portion of overall variance. Last, in order to correct for large pilot data, the original pilot sample size decreases the new suggested sample size. This formula approximation captures 96% of the variance in sample size scores and should allow a researcher to estimate based on their own data.
# APA-formatted coefficient table for the final model.
apa_table(tidy(user_model3))
| term | estimate | std.error | statistic | p.value |
|---|---|---|---|---|
| (Intercept) | 39.27 | 0.44 | 89.84 | 0.00 |
| sample_size | 0.70 | 0.00 | 366.67 | 0.00 |
| sd_item | 0.00 | 0.00 | 0.95 | 0.34 |
| original_n | -0.69 | 0.01 | -97.62 | 0.00 |
# Fit the final model separately within each decile cutoff and collect
# the R^2 values to find the decile best matching the corrected sizes.
by_cutoff <- list()
R2 <- list()
for (cutoff in unique(summary_long$name)){
  by_cutoff[[cutoff]] <- lm(new_sample ~ sample_size + sd_item + original_n,
                            data = summary_long %>% filter(name == cutoff))
  # [[ stores the scalar directly instead of single-bracket list assignment
  R2[[cutoff]] <- summary(by_cutoff[[cutoff]])$r.squared
}
R2
## $`Decile 0`
## [1] 0.966061
##
## $`Decile 10`
## [1] 0.9631429
##
## $`Decile 20`
## [1] 0.9671475
##
## $`Decile 30`
## [1] 0.9675296
##
## $`Decile 40`
## [1] 0.9669199
##
## $`Decile 50`
## [1] 0.9647775
##
## $`Decile 60`
## [1] 0.9642954
##
## $`Decile 70`
## [1] 0.9642689
##
## $`Decile 80`
## [1] 0.9624604
##
## $`Decile 90`
## [1] 0.9650027
Last, we examine the question of an appropriate SE decile. All graphs for power, heterogeneity, scale, and correction are presented below. First, the 0%, 10%, and 20% deciles are likely too restrictive, providing very large estimates that do not always find a reasonable sample size in proportion to the pilot sample size, scale, and heterogeneity. If we examine the \(R^2\) values for each decile of our regression equation separately, we find that the 50% (0.965) represents the best match to our corrected sample size suggestions. The 50% decile, in the corrected format, appears to meet all goals: 1) increases with heterogeneity and scale of data, and 2) higher suggested values for small original samples and a leveling effect at larger pilot data.
The formula for finding the corrected sample size using a 50% decile is: \(Final Sample = 39.269 + 0.700 \times X_{Suggested Sample Size} + 0.003 \times X_{SD Items} - 0.695 \times X_{Pilot Sample Size}\). The suggested sample size will be estimated from the 80%, 85%, 90%, or 95% selection at the 50% decile as shown above. The item SD can be calculated directly from the data, and the pilot sample size is the sample size of the data from which a researcher is simulating their samples.
# Suggested (or corrected) sample size vs. pilot sample size, faceted by
# scale and power threshold; black points mark the y = x reference.
# df: a filtered subset of summary_long. yvar: an unquoted column name,
# sample_size (uncorrected) or new_sample (decay-corrected).
plot_projection <- function(df, yvar) {
  ggplot(df, aes(original_n, {{ yvar }}, color = name)) +
    geom_point() +
    geom_point(aes(original_n, original_n), color = "black") +
    geom_line() +
    theme_classic() +
    facet_wrap(~ scale2*power) +
    xlab("Original Sample Size") +
    ylab("Suggested Sample Size") +
    scale_color_discrete(name = "Cutoff Score")
}
plot_projection(summary_long %>% filter(source == "Low Variance"), sample_size)
plot_projection(summary_long %>% filter(source == "Low Variance"), new_sample)
plot_projection(summary_long %>% filter(source == "Medium Variance"), sample_size)
# NOTE: the original recomputed summary_long$new_sample here with the
# same decay vector; the values were unchanged, so that redundant
# assignment was dropped.
plot_projection(summary_long %>% filter(source == "Medium Variance"), new_sample)
plot_projection(summary_long %>% filter(source == "High Variance"), sample_size)
plot_projection(summary_long %>% filter(source == "High Variance"), new_sample)